

clear all
set more off
cap log close

***************************************************************************************************
* Program: 3_other_data_prep.do
* Purpose: 
***************************************************************************************************

************************************************************************
** set global macros
************************************************************************
global root_dir "~/Dropbox/Prison_Covid/Visitation misconduct paper/Science Advances R&R/Replication"
global data_raw "${root_dir}/Data/Raw"
global gas_price_dir "${data_raw}/gas_price"
global compstat_dir "${data_raw}/cdcr_compstat"
global macroecon_data "${data_raw}/macro_economic_data"
global boundaries_dir "${data_raw}/boundaries"
global fire_dir "${data_raw}/california_historic_fire_perimeters_data"
global weather_dir "${data_raw}/weather"
global processed_dir "${root_dir}/Data/Processed"


************************************************************************
* 1. Gas price data in California - weekly level
// https://www.eia.gov/dnav/pet/hist/LeafHandler.ashx?n=pet&s=emm_epm0_pte_sca_dpg&f=w
************************************************************************

forval yr = 2017(1)2020{
	
	import excel "${gas_price_dir}/EMM_EPM0_PTE_SCA_DPGw.xls", sheet("Data 1") cellrange(A3:B1257) firstrow clear

	gen year = year(Date)
	gen week = week(Date)

	keep if year == `yr'

	rename Date end_date
	rename WeeklyCaliforniaAllGradesAll gas_price

	save "${processed_dir}/GasPrices_WeeklyCaliforniaAllGrades_`yr'.dta", replace 

}

import excel "${gas_price_dir}/EMM_EPM0_PTE_SCA_DPGm.xls", sheet("Data 1") cellrange(A3:B296) firstrow clear

gen year = year(Date)

rename Date mid_date
rename CaliforniaAllGradesAllFormul monthly_gas_price

sort mid_date
gen lag_monthly_gas_price = monthly_gas_price[_n-1]

save "${processed_dir}/GasPrices_MonthlyCaliforniaAllGrades.dta", replace 


****************************************************************
* Clean compstat incident data
****************************************************************

cap program drop clean_compstat
program define clean_compstat

		gen var_to_keep = 0
		replace var_to_keep = 1 if subcategory == "Inmate Appeals" & metrictitle == "Appeals Per 100 Inmates"
		replace var_to_keep = 1 if subcategory == "Number of Incidents" & ///
			inlist(metrictitle, "Per 100 inmates", "Number of Incidents", "Documented Use of Force")
		replace var_to_keep = 1 if subcategory == "Incidents" & metrictitle == "Cell Phone Seizures"
		replace var_to_keep = 1 if subcategory == "Incidents" & index(metrictitle, "Aggravated Battery on ") > 0 
		replace var_to_keep = 1 if subcategory == "Incidents" & metrictitle == "Assault on Inmate (Total)"
		replace var_to_keep = 1 if subcategory == "Incidents" & ///
			inlist(metrictitle,"Assault on a Peace Officer or Non-Prisoner (Total)", "Assault on a Peace Officer or Non-")
		
		replace var_to_keep = 1 if subcategory == "Incidents" & metrictitle == "Battery on Inmate (Total)"
		replace var_to_keep = 1 if subcategory == "Incidents" & ///
			inlist(metrictitle, "Battery on a Peace Officer or Non-Prisoner (Total)", "Battery on a Peace Officer or Non-")
		// drug related variables 
		replace var_to_keep = 1 if subcategory == "Incidents" & inlist(metrictitle, ///
			"Methamphetamine", "Marijuana", "Heroin", "Cocaine", "Amphetamine", "Barbiturates", "Codeine") 
		replace var_to_keep = 1 if subcategory == "Incidents" & inlist(metrictitle, ///
			"Morphine", "Synthetic Marijuana - Spice")
		replace var_to_keep = 1 if subcategory == "Incidents" & index(metrictitle, "ControlledSubstances/Stimulants/Sedatives") > 0
		replace var_to_keep = 1 if subcategory == "Incidents" & inlist(metrictitle, "Controlled", "Other Controlled")
		replace var_to_keep = 1 if metrictitle == "Inmate Placements on CSW"
		replace var_to_keep = 1 if metrictitle == "Count of CSW Items Recovered"
		replace var_to_keep = 1 if subcategory == "Inmate Disciplinaries" & inlist(metrictitle, "Per 100 inmates", "Possession of Cell Phone(s)", "Fighting")
		replace var_to_keep = 1 if subcategory =="Sick Leave"  & index(metrictitle, "Total Hours") 
		replace var_to_keep = 1 if subcategory == "Overtime" & index(metrictitle, "Total")
		replace var_to_keep = 1 if subcategory == "Total Bed Capacity" & index(metrictitle, "Inmate Count") > 0
		replace var_to_keep = 1 if metrictitle == "Inmate Count"
		replace var_to_keep = 1 if index(metrictitle, "Inmate Level") > 0


		keep if var_to_keep == 1

		gen varname = metrictitle
		
		replace varname = "uof_per100" if subcategory == "Number of Incidents" & metrictitle == "Per 100 inmates" & metrictitle[_n-1] == "Documented Use of Force"
		replace varname = "incidents_per100" if subcategory == "Number of Incidents" & metrictitle == "Per 100 inmates" & metrictitle[_n-1] == "Number of Incidents"
		replace varname = "incidents" if subcategory == "Number of Incidents" & metrictitle == "Number of Incidents"
		replace varname = "doc_uof" if subcategory == "Number of Incidents" & metrictitle == "Documented Use of Force"
		
		replace varname = "assault_on_inmate" if subcategory == "Incidents" & metrictitle == "Assault on Inmate (Total)"
		replace varname = "assault_on_offi_nonpri" if subcategory == "Incidents" & ///
			inlist(metrictitle,"Assault on a Peace Officer or Non-Prisoner (Total)", "Assault on a Peace Officer or Non-")
		
		replace varname = "battery_on_inmate" if subcategory == "Incidents" & metrictitle == "Battery on Inmate (Total)"
		replace varname = "battery_on_offi_nonpri" if subcategory == "Incidents" & ///
			inlist(metrictitle, "Battery on a Peace Officer or Non-Prisoner (Total)", "Battery on a Peace Officer or Non-")
			
		replace varname = "agg_battery_nonpri" if subcategory == "Incidents" & metrictitle == "Aggravated Battery on a Non-Prisoner"
		replace varname = "agg_battery_officer" if subcategory == "Incidents" & ///
			inlist(metrictitle, "Aggravated Battery on a PeaceOfficer", "Aggravated Battery on a Peace Officer")
		
		replace varname = "cell_seizures" if metrictitle == "Cell Phone Seizures"
		replace varname = "appeals_per100" if metrictitle == "Appeals Per 100 Inmates"
		
		replace varname = "disciplinaries_per100" if subcategory == "Inmate Disciplinaries" & inlist(metrictitle, "Per 100 inmates")
		replace varname = "disciplinaries_cellphone" if subcategory == "Inmate Disciplinaries" & inlist(metrictitle, "Possession of Cell Phone(s)")
		replace varname = "disciplinaries_fighting" if subcategory == "Inmate Disciplinaries" & inlist(metrictitle, "Fighting")
		
		replace varname = "placement_csw" if metrictitle == "Inmate Placements on CSW"
		replace varname = "count_csw_items" if metrictitle == "Count of CSW Items Recovered"
		
		replace varname = lower(metrictitle) if subcategory == "Incidents" & inlist(metrictitle, ///
			"Methamphetamine", "Marijuana", "Heroin", "Cocaine", "Amphetamine", "Barbiturates", "Codeine", "Morphine") 
	
		replace varname = "syn_marijuana" if subcategory == "Incidents" & metrictitle == "Synthetic Marijuana - Spice"
		replace varname = "ctrl_substances" if subcategory == "Incidents" & inlist(metrictitle, "ControlledSubstances/Stimulants/Sedatives", "Controlled")
		replace varname = "oth_ctrl_substances" if subcategory == "Incidents" & inlist(metrictitle, "Other ControlledSubstances/Stimulants/Sedatives", "Other Controlled")
		
		replace varname = "overtime_hours" if subcategory =="Overtime"  & index(metrictitle, "Total")
		replace varname = "sick_leave_hours" if subcategory =="Sick Leave"  & index(metrictitle, "Total Hours") 
		replace varname = "inmate_count" if subcategory == "Total Bed Capacity" & index(metrictitle, "Inmate Count") > 0
		
		replace varname = "inmate_level_1" if index(metrictitle, "Inmate Level I (Classification") > 0
		replace varname = "inmate_level_2" if index(metrictitle, "Inmate Level II (Classification Score") > 0
		replace varname = "inmate_level_3" if index(metrictitle, "Inmate Level III (Classification Score") > 0
		replace varname = "inmate_level_4" if index(metrictitle, "Inmate Level IV (Classification Score") > 0

			
	
		drop topcategory subcategory metrictitle var_to_keep // topcategory == "Custody Operations" for ALL observations

		rename jan value1
		rename feb value2
		rename mar value3
		rename apr value4
		rename may value5
		rename jun value6
		rename jul value7
		rename aug value8
		rename sep value9
		rename oct value10
		rename nov value11
		rename dec value12

		
		reshape long value, i(institution varname year) j(month)
		destring value, replace 
		 
		reshape wide value, i(institution month year) j(varname) string

		rename value* *
		
		gen battery = battery_on_inmate + battery_on_offi_nonpri
		gen agg_battery = agg_battery_nonpri + agg_battery_officer
		gen assault = assault_on_inmate + assault_on_offi_nonpri
		gen drug = amphetamine + barbiturates + cocaine + codeine + heroin + ///
			marijuana + methamphetamine + morphine + oth_ctrl_substances + syn_marijuana + ctrl_substances 
		keep institution year month appeals_per100 cell_seizures doc_uof incidents incidents_per100 uof_per100 ///
			battery agg_battery assault drug disciplinaries_per100 disciplinaries_cellphone disciplinaries_fighting ///
			assault_on_inmate assault_on_offi_nonpri battery_on_inmate battery_on_offi_nonpri agg_battery_nonpri agg_battery_officer ///
			placement_csw count_csw_items overtime_hours sick_leave_hours inmate_count inmate_level_*
		
		gen incidents_pop = incidents / incidents_per100 * 100
		gen uof_pop = doc_uof / uof_per100 * 100
		gen security4_share = inmate_level_4 / inmate_count 
		
		gen date = 31
		replace date = 30 if inlist(month, 4, 6, 9, 11)
		replace date = 28 if month == 2

		gen end_date = mdy(month, date, year)
		drop date
		format end_date %td
		
		gen mid_date = mdy(month, 15, year)
		format mid_date %td
		
end


forval yr = 2017/2019{
	
	import delimited "${compstat_dir}/finished-2009-2019-everything.csv", clear bindquotes(strict)

	// clean to get only 
	keep if year == `yr'

	//keep if index(subcategory, "Incidents") + index(subcategory, "Violence") + index(subcategory, "Force") > 0
	
	// run program to clean compstat data
	clean_compstat
	
	if `yr' != 2019{
		save "${processed_dir}/cdcr_compstat_incident_`yr'.dta", replace 
	}
	if `yr' == 2019{
		save "${processed_dir}/cdcr_compstat_incident_`yr'_incomplete.dta", replace 
	}	

}

import delimited "${compstat_dir}/2019-DAI_reception_centers.csv", clear bindquotes(strict)
clean_compstat	
save "${processed_dir}/cdcr_compstat_incident_2019_reception_centers.dta", replace 

use "${processed_dir}/cdcr_compstat_incident_2019_incomplete.dta", clear 
append using "${processed_dir}/cdcr_compstat_incident_2019_reception_centers.dta"
save "${processed_dir}/cdcr_compstat_incident_2019.dta", replace 

***************
* get 2020 compstat incident data
***************


import delimited "${compstat_dir}/finished-2020-2023-incidents.csv", clear

keep if year == 2020

gen var_to_keep = 0
replace var_to_keep = 1 if subcategory == "Number of Incidents"

replace var_to_keep = 1 if subcategory == "Incident Categories" & index(metrictitle, "Aggravated Battery on ") > 0 
replace var_to_keep = 1 if subcategory == "Incident Categories" & metrictitle == "Assault on Inmate (Total)"
replace var_to_keep = 1 if subcategory == "Incident Categories" & metrictitle == "Assault on a Peace Officer or Non-Prisoner (Total)"
replace var_to_keep = 1 if subcategory == "Incident Categories" & metrictitle == "Battery on Inmate (Total)"
replace var_to_keep = 1 if subcategory == "Incident Categories" & metrictitle == "Battery on a Peace Officer or Non-Prisoner (Total)"

// drug related variables 
replace var_to_keep = 1 if subcategory == "Controlled Substance" & inlist(metrictitle, ///
	"Methamphetamine", "Marijuana", "Heroin", "Cocaine", "Amphetamine", "Barbiturates", "Codeine") 
replace var_to_keep = 1 if subcategory == "Controlled Substance" & inlist(metrictitle, ///
	"Morphine", "Synthetic Marijuana - Spice")
replace var_to_keep = 1 if subcategory == "Controlled Substance" & metrictitle == "Controlled Substances/Stimulants/Sedatives"
replace var_to_keep = 1 if subcategory == "Controlled Substance" & metrictitle == "Other Controlled Substances/Stimulants/Sedatives"

keep if var_to_keep == 1

gen varname = metrictitle
replace varname = "incidents" if subcategory == "Number of Incidents" & metrictitle == "Number of Incidents"
replace varname = "doc_uof" if subcategory == "Number of Incidents" & metrictitle == "Documented Use of Force"

replace varname = "assault_on_inmate" if subcategory == "Incident Categories" & metrictitle == "Assault on Inmate (Total)"
replace varname = "assault_on_offi_nonpri" if subcategory == "Incident Categories" & metrictitle == "Assault on a Peace Officer or Non-Prisoner (Total)"

replace varname = "battery_on_inmate" if subcategory == "Incident Categories" & metrictitle == "Battery on Inmate (Total)"
replace varname = "battery_on_offi_nonpri" if subcategory == "Incident Categories" & metrictitle == "Battery on a Peace Officer or Non-Prisoner (Total)"
	
replace varname = "agg_battery_nonpri" if subcategory == "Incident Categories" & metrictitle == "Aggravated Battery on a Non-Prisoner"
replace varname = "agg_battery_officer" if subcategory == "Incident Categories" & metrictitle == "Aggravated Battery on a Peace Officer"

replace varname = lower(metrictitle) if subcategory == "Controlled Substance" & inlist(metrictitle, ///
	"Methamphetamine", "Marijuana", "Heroin", "Cocaine", "Amphetamine", "Barbiturates", "Codeine", "Morphine") 

replace varname = "syn_marijuana_spice" if metrictitle == "Synthetic Marijuana - Spice"	
replace varname = "ctrl_substances" if subcategory == "Controlled Substance" & metrictitle == "Controlled Substances/Stimulants/Sedatives"
replace varname = "oth_ctrl_substances" if subcategory == "Controlled Substance" & metrictitle == "Other Controlled Substances/Stimulants/Sedatives"
	

drop subcategory metrictitle var_to_keep // topcategory == "Custody Operations" for ALL observations

destring value, replace  
reshape wide value, i(institution month year) j(varname) string
rename value* *


keep institution month year battery_on_inmate battery_on_offi_nonpri doc_uof incidents marijuana methamphetamine 

gen battery = battery_on_inmate + battery_on_offi_nonpri
keep institution year month doc_uof incidents battery 

replace month = "1" if month == "Jan"
replace month = "2" if month == "Feb"
replace month = "3" if month == "Mar"
replace month = "4" if month == "Apr"
replace month = "5" if month == "May"
replace month = "6" if month == "Jun"
replace month = "7" if month == "Jul"
replace month = "8" if month == "Aug"
replace month = "9" if month == "Sep"
replace month = "10" if month == "Oct"
replace month = "11" if month == "Nov"
replace month = "12" if month == "Dec"

destring month, replace 
gen date = 31
replace date = 30 if inlist(month, 4, 6, 9, 11)
replace date = 28 if month == 2

gen end_date = mdy(month, date, year)
drop date
format end_date %td

gen mid_date = mdy(month, 15, year)
format mid_date %td

save "${processed_dir}/cdcr_compstat_incident_2020.dta", replace 



****************************************************************
* Get all prison temperature data (tmax, tmin, ppt)
* By getting all prisons' census tract
* And merge the tract-level daily temperature data. 
* Source: https://www.openicpsr.org/openicpsr/project/230941/version/V2/view
* National Neighborhood Data Archive (NaNDA): PRISM Climate of Census Tracts, United States, 1981-2024
****************************************************************

use  "${data_raw}/CA_station_prison_address_google_geocoded.dta", clear
geoinpoly lat_google lon_google using "${boundaries_dir}/ct_boundary_coor.dta"
merge m:1 _ID using "${boundaries_dir}/ct_boundary_data_simple.dta", keep(master match) nogenerate 
drop _ID 
rename GEOID fips_tract_10
geoinpoly lat_google lon_google using "${boundaries_dir}/us_2020_tract_coor"
merge m:1 _ID using "${boundaries_dir}/us_2020_tract_data", keep(master match) keepusing(GEOID) nogenerate 
drop _ID 
rename GEOID fips_tract_20
destring fips_tract_20, replace 
save  "${processed_dir}/CA_station_prison_address_google_geocoded_w_tract.dta", replace 

local yr = 2020
import delimited "${weather_dir}/nanda_PRISM_daily_Tract_`yr'_01P.csv", clear
keep month day
sort month day 
drop if month == month[_n-1] & day==day[_n-1]
save "${processed_dir}/weather/day_list_tmp.dta", replace 

set more off
forval yr = 2017/2019{
	import delimited "${weather_dir}/nanda_PRISM_daily_Tract_`yr'_01P.csv", clear

	keep if floor(tract_fips10 / 1000000000) == 6 // keep CA only 
	gen year = `yr'
	replace tmax = tmax / 100
	replace tmin = tmin / 100
	rename tract_fips10 fips_tract 
	
	compress 

	save "${processed_dir}/weather/nanda_PRISM_daily_Tract_`yr'_01P_ca.dta", replace 
}

local yr = 2020
import delimited "${weather_dir}/nanda_PRISM_daily_Tract_`yr'_01P.csv", clear

keep if floor(tract_fips / 1000000000) == 6 // keep CA only 
gen year = `yr'
replace tmax = tmax / 100
replace tmin = tmin / 100
compress 
rename tract_fips fips_tract 
save "${processed_dir}/weather/nanda_PRISM_daily_Tract_`yr'_01P_ca.dta", replace 

forval yr = 2017/2019{	

	use "${processed_dir}/CA_station_prison_address_google_geocoded_w_tract.dta", clear 
	keep FID fips_tract_10 
	rename fips_tract_10 fips_tract
	cross using "${processed_dir}/weather/day_list_tmp.dta"
	if `yr' != 2020{
		drop if day == 29 & month == 2
	}
	gen year = `yr'
	gen date = mdy(month, day, year)
	format date %td
	merge m:1 fips_tract month day using "${processed_dir}/weather/nanda_PRISM_daily_Tract_`yr'_01P_ca.dta", keep(master match) nogenerate 
	save "${processed_dir}/weather/daily_weather_ca_prisons_tract_`yr'.dta", replace 

}


use "${processed_dir}/CA_station_prison_address_google_geocoded_w_tract.dta", clear 
keep FID fips_tract_20 
cross using "${processed_dir}/weather/day_list_tmp.dta"
drop if day == 29 & month == 2
gen year = 2020
gen date = mdy(month, day, year)
format date %td
rename fips_tract_20 fips_tract 
merge m:1 fips_tract month day using "${processed_dir}/weather/nanda_PRISM_daily_Tract_2020_01P_ca.dta", keep(master match) nogenerate 
save "${processed_dir}/weather/daily_weather_ca_prisons_tract_2020.dta", replace 

 
forval yr = 2017/2020{	
	use "${processed_dir}/weather/daily_weather_ca_prisons_tract_`yr'.dta", clear 

	gen week = week(date)
	foreach var of varlist tmax tmin ppt{
		gegen week_m_`var' = mean(`var'), by(FID week)
	}
	sort FID week 
	drop if FID == FID[_n-1] & week == week[_n-1]

	keep FID week year week_*
	rename FID Shapefile_ID // FID == Shapefile_ID

	save  "${processed_dir}/weather/weekly_weather_ca_prisons_tract_`yr'.dta", replace 
}




****************************************************************
* Get all raster-level temperature data (tmax, tmin, tmean, ppt)
****************************************************************
set more off 
forval year = 2017/2020{
	
	use "${weather_dir}/monthly_tmean_ca_prisons_`year'.dta", clear 
	merge 1:1 FID month using "${weather_dir}/monthly_tmax_ca_prisons_`year'.dta", nogenerate
	merge 1:1 FID month using "${weather_dir}/monthly_ppt_ca_prisons_`year'.dta", nogenerate

	drop index 
	
	gen year = `year'

	save "${weather_dir}/monthly_temp_info_ca_prisons_`year'.dta", replace 

}

use "${weather_dir}/monthly_temp_info_ca_prisons_2017.dta", clear 
append using "${weather_dir}/monthly_temp_info_ca_prisons_2018.dta"
append using "${weather_dir}/monthly_temp_info_ca_prisons_2019.dta"
append using "${weather_dir}/monthly_temp_info_ca_prisons_2020.dta"
// merge to get prison name (formatted differently), FID (to merge later) and address
merge m:1 FID using "${data_raw}/CA_station_prison_address.dta", ///
	keepusing(NAME) nogenerate
destring month, replace 
sort FID year month
save "${processed_dir}/weather/monthly_temp_info_ca_prisons_2017_2018_2019_2020.dta", replace 


*************************************************************************
*Get CA wirefire data
*************************************************************************

// shp2dta using "${fire_dir}/California_Fire_Perimeters_right_projection.shp", ///
// 	database("${fire_dir}/California_Fire_Perimeters_data") ///
// 	coordinates("${fire_dir}/California_Fire_Perimeters_coor") replace 

** get only sample between 2017-2020
use "${fire_dir}/California_Fire_Perimeters_data", clear
tostring ALARM_DATE CONT_DATE, replace 

gen alarm_month = substr(ALARM_DATE, 5, 2)
gen alarm_day = substr(ALARM_DATE, 7, 2)
gen alarm_year = substr(ALARM_DATE, 1, 4)

gen cont_month = substr(CONT_DATE, 5, 2)
gen cont_day = substr(CONT_DATE, 7, 2)
gen cont_year = substr(CONT_DATE, 1, 4)

destring alarm_month alarm_day cont_month cont_day alarm_year cont_year, replace 
gen alarm_date = mdy(alarm_month, alarm_day, alarm_year)
gen cont_date = mdy(cont_month, cont_day, cont_year)

format alarm_date cont_date %td

gen byte to_keep = (YEAR_ == 2019)
replace to_keep = 1 if (YEAR_ == 2018) & alarm_month >= 3
replace to_keep = 1 if (YEAR_ == 2017) & (alarm_month >= 2 & alarm_month<= 11)
replace to_keep = 1 if (YEAR_ == 2020) &  alarm_month <= 3
keep if to_keep == 1
drop to_keep 

save "${processed_dir}/fire/California_Fire_Perimeters_data_2017_2020.dta", replace 


use "${fire_dir}/California_Fire_Perimeters_coor", clear 
gen obs = _n 
merge m:1 _ID using "${processed_dir}/fire/California_Fire_Perimeters_data_2017_2020.dta", keepusing(_ID)
keep if _merge == 3 
drop _merge
save "${processed_dir}/fire/California_Fire_Perimeters_coor_2017_2020", replace 

use "${boundaries_dir}/us_county_data.dta", clear 
keep if STATEFP == "06"
save "${processed_dir}/ca_county_data.dta", replace  

// obtain cty-level fire incidents
use "${boundaries_dir}/us_county_coord.dta", clear 
gen obs = _n 
merge m:1 _ID using "${processed_dir}/ca_county_data.dta", keepusing(_ID GEOID)
keep if _merge == 3 
drop _merge

rename GEOID fips 
rename _X cty_lon
rename _Y cty_lat
destring fips, replace 
drop _ID

geoinpoly cty_lat cty_lon using "${processed_dir}/fire/California_Fire_Perimeters_coor_2017_2020"

drop if missing(_ID)

sort fips _ID 
drop if fips == fips[_n-1] & _ID ==_ID[_n-1]

keep fips _ID  
merge m:1 _ID using "${processed_dir}/fire/California_Fire_Perimeters_data_2017_2020.dta", keepusing(_ID alarm_date cont_date ALARM_DATE CONT_DATE INC_NUM FIRE_NAME)
keep if _merge == 3 
drop _merge 

save "${processed_dir}/fire/county_fire_incidents_2017_2020.dta", replace 


use "${processed_dir}/fire/county_fire_incidents_2017_2020.dta", clear 

// assert cont_date >= alarm_date 
replace cont_date = alarm_date if CONT_DATE == "18991230" // unknown date 

gen start_m = mofd(alarm_date)
gen end_m   = mofd(cont_date)
format start_m end_m %tm

gen nmonths = end_m - start_m + 1

expand nmonths
bysort fips _ID: gen month = start_m + _n - 1
format month %tm
drop nmonths start_m end_m

gen year  = yofd(dofm(month))
gen month_num = month(dofm(month))

keep fips year month_num
sort fips year month_num
drop if fips==fips[_n-1] & year==year[_n-1] & month_num==month_num[_n-1]
rename month_num month 

gen byte wildfire = 1 

save "${processed_dir}/fire/county_wirefire_incident_2017_2020_for_merge.dta", replace 

************************************************************************
** prep macroecon data
************************************************************************

* CA unemployment rate (caur)
import delimited "${macroecon_data}/CAUR.csv", clear bindquotes(strict) 

gen year = substr(observation_date, 1, 4)
gen month = substr(observation_date, 6, 2)

destring year month, replace 

rename caur ca_unemployment_rate

keep year month ca_unemployment_rate

save "${processed_dir}/macroecon/ca_unemployment_rate.dta", replace 


* S&P 500

import delimited "${macroecon_data}/SP500.csv", clear

gen year = substr(observation_date, 1, 4)
gen month = substr(observation_date, 6, 2)

destring year month, replace 

gegen sp500_month = mean(sp500), by(year month)

sort year month observation_date
drop if year==year[_n-1] & month==month[_n-1]

keep year month sp500_month 
rename sp500_month sp500

sort year month 
gen log_sp500 = log(sp500)
gen d_log_sp500 = log_sp500 - log_sp500[_n-1]

save "${processed_dir}/macroecon/sp500.dta", replace 

** date 
import delimited "${macroecon_data}/SP500.csv", clear

gen year = substr(observation_date, 1, 4)
gen month = substr(observation_date, 6, 2)
destring year month, replace

gen date = date(observation_date, "YMD")
format date %td

gen week = week(date)

gegen sp500_week = mean(sp500), by(year week)
gegen week_start_date = min(date), by(year week)

sort year week
drop if year==year[_n-1] & week==week[_n-1]

keep year week sp500_week week_start_date 

sort year week 
gen log_sp500 = log(sp500_week)
gen d_log_sp500 = log_sp500 - log_sp500[_n-1]

save "${processed_dir}/macroecon/sp500_weekly.dta", replace 



	
*************************************************************************
* FINAL MERGE: Get monthly visit analysis data and run regression
*************************************************************************
* append incident data from 2017-2020
use "${processed_dir}/cdcr_compstat_incident_2017.dta", clear
append using "${processed_dir}/cdcr_compstat_incident_2018.dta"
append using "${processed_dir}/cdcr_compstat_incident_2019.dta"
append using "${processed_dir}/cdcr_compstat_incident_2020.dta"

* merge w/ gas price data
merge m:1 mid_date using "${processed_dir}/GasPrices_MonthlyCaliforniaAllGrades.dta", keepusing(monthly_gas_price lag_monthly_gas_price) keep(master match) nogenerate 

* obtain fips for prisons
replace institution = "VSP" if institution == "VSPW"
replace institution = "FOL" if institution == "FSP"
rename institution acronym
merge m:1 acronym using "${data_raw}/cdcr-population-data-master/data/cdcr_population_w_ShapefileID.dta", keepusing(COUNTYFIPS prison_ZCTA NAME POPULATION population_felons_201912) keep(master match) nogenerate 
rename acronym institution
rename COUNTYFIPS fips 
replace institution = "FSP" if institution == "FOL"

* merge to get visit info
sort NAME month
merge 1:1 NAME year month using "${processed_dir}/phones/monthly_prisons_visit_multi_year.dta", keep(master match) nogenerate

* merge to get wildfire dummy
merge m:1 fips year month using "${processed_dir}/fire/county_wirefire_incident_2017_2020_for_merge.dta", keepusing(wildfire) keep(master match) nogenerate
replace wildfire = 0 if wildfire == .

* merge data of average distance (between prison and visitor's zip codes)
merge m:1 NAME using "${processed_dir}/wavg_dist_zcta_to_prison.dta", ///
 nogenerate keepusing(wavg_dist_zcta_to_prison dist_rank dist_quartile) keep(master match)
 
* merge w/ temperature data 
merge 1:1 NAME year month using "${processed_dir}/weather/monthly_temp_info_ca_prisons_2017_2018_2019_2020.dta", keep(master match) nogenerate

* merge w/ macroecon data 
merge m:1 year month using "${processed_dir}/macroecon/ca_unemployment_rate.dta", keep(master match) nogenerate
merge m:1 year month using "${processed_dir}/macroecon/sp500.dta", keep(master match) nogenerate

* generate variables 
gen tmean_sq = tmean ^ 2
gen tmax_sq = tmax ^ 2

// manually calculate incidents_per100 for 2020
replace incidents_per100 = incidents / population_felons_201912 * 100 if year == 2020
replace uof_per100 = doc_uof / population_felons_201912 * 100 if year == 2020

//label variable incidents_per100 "Incident per 100"
// label variable uof_per100 "Use of Force per 100"

// log transformation
local vars "monthly_gas_price norm_visits_new norm_visits_repeat norm_visits_night norm_visits norm_visits_7to15 norm_visits_7to15_1hr sick_leave_hours overtime_hours inmate_count"
foreach var of local vars{
	summarize  `var'
	local min = `=r(min)'
	display "`min'"
	gen log_`var' = log(`var')
}

// compute lead- lag variables 
sort institution year month
forval i = 5(-1)1{
	gen log_gas_price_pre`i' = log_monthly_gas_price[_n-`i'] if institution[_n-`i'] == institution[_n]
}
forval i = 1/3{
	gen log_gas_price_post`i' = log_monthly_gas_price[_n+`i'] if institution[_n+`i'] == institution[_n]
}

* get cost of travel index
gen ca_priceXdist = monthly_gas_price * wavg_dist_zcta_to_prison
gen log_ca_priceXdist = log_monthly_gas_price * wavg_dist_zcta_to_prison
gen date = mdy(month, 1, year)
format date %td

* label variables 
label variable log_monthly_gas_price "Log Monthly Gas Price"
label variable log_norm_visits              "Log Normalized Visits (All Presence)"
label variable log_norm_visits_7to15        "Log Normalized Visits"
label variable log_norm_visits_7to15_1hr    "Log Normalized Visits (7am–3pm, $\geq$1 Hour)"


label variable wavg_dist_zcta_to_prison "Weighted Distance to Prisoner Zip Codes"
label variable monthly_gas_price "Monthly CA Gas Price"

label variable tmean "Prison Monthly Avg Temperature"
label variable tmean_sq "Prison Monthly Avg Temperature$^2$"
label variable tmax "Prison Monthly Max Temperature"
label variable tmax_sq "Prison Monthly Max Temperature$^2$"
label variable ppt "Prison Monthly Precipitation"
//label variable mon_aqi "Air Quality Index"

label variable appeals_per100 "Appeals per 100"
label variable cell_seizures "Cell Seizures"
label variable doc_uof "Documented Use of Force"
label variable incidents "Incidents"
label variable incidents_per100 "Incident per 100"
label variable uof_per100 "Use of Force per 100"
label variable battery "Battery"
label variable agg_battery "Aggravated Battery"
label variable assault "Assault"
label variable drug "Drug possession"
label variable incidents_pop "Population (Incidents)"
label variable uof_pop "Population (UoF)"
label variable assault_on_inmate "Assault on Inmate"
label variable assault_on_offi_nonpri "Assault on Officer (Non-Prisoner)"
label variable battery_on_inmate "Battery on Inmate"
label variable battery_on_offi_nonpri "Battery on Officer (Non-Prisoner)"
label variable cell_seizures "Cell Seizures"
label variable count_csw_items "Count of Contraband Items"
label variable disciplinaries_cellphone "Disciplinaries: Cellphone"
label variable disciplinaries_fighting "Disciplinaries: Fighting"
label variable disciplinaries_per100 "Disciplinaries per 100"
label variable log_ca_priceXdist "Log(Monthly Gas Price) X Avg Distance to Prison"
label variable placement_csw "Placement of Contraband/CSW"
label variable agg_battery_nonpri "Aggravated Battery (Non-Prisoner)"
label variable agg_battery_officer "Aggravated Battery on Officer"

label variable log_gas_price_pre5 "Log Gas Price (-5)"
label variable log_gas_price_pre4 "Log Gas Price (-4)"
label variable log_gas_price_pre3 "Log Gas Price (-3)"
label variable log_gas_price_pre2 "Log Gas Price (-2)"
label variable log_gas_price_pre1 "Log Gas Price (-1)"
label variable log_gas_price_post1 "Log Gas Price (+1)"
label variable log_gas_price_post2 "Log Gas Price (+2)"
label variable log_gas_price_post3 "Log Gas Price (+3)"

label variable ca_unemployment_rate "CA Unemployment Rate"
label variable sp500 "SP500"
label variable log_sp500 "Log SP500"
label variable log_overtime_hours "Log Overtime Hours"
label variable log_sick_leave_hours "Log Sick Leave Hours"
label variable wildfire "Wildfire"

save "${processed_dir}/CA_incidents_visit_reg_data.dta", replace 
	
	
******************************************************************************************
* Prepare Prison X Week Level Analaysis Data
******************************************************************************************
* for 2018 - week 52 has two observations: Dec 24 & 31- take the average of the two 	
use "${processed_dir}/GasPrices_WeeklyCaliforniaAllGrades_2018.dta", clear
gegen m_gas_price = mean(gas_price), by(week)
drop if end_date == mdy(12, 24, 2018)
drop gas_price 
rename m_gas_price gas_price
save "${processed_dir}/GasPrices_WeeklyCaliforniaAllGrades_2018_uniq.dta", replace 

************************
set more off
use "${processed_dir}/weather/weekly_weather_ca_prisons_tract_2017.dta", clear 
forval yr = 2018/2020{
	append using "${processed_dir}/weather/weekly_weather_ca_prisons_tract_`yr'.dta"
}
save "${processed_dir}/weather/weekly_weather_ca_prisons_tract.dta", replace 

use "${processed_dir}/GasPrices_WeeklyCaliforniaAllGrades_2017.dta", clear 
append using "${processed_dir}/GasPrices_WeeklyCaliforniaAllGrades_2018_uniq.dta"
forval yr = 2019/2020{
	append using "${processed_dir}/GasPrices_WeeklyCaliforniaAllGrades_`yr'.dta"
}
save "${processed_dir}/GasPrices_WeeklyCaliforniaAllGrades.dta", replace 


** APPEND all data
use "${processed_dir}/phones/weekly_state_prisons_visits_2017_2020.dta", clear
drop if week <=4 & year == 2017 // incomplete data 
drop if week >=49 & year == 2017
drop if week <=8 & year == 2018 // incomplete data 
drop if week >= 14 & year == 2020 // incomplete data

merge m:1 year Shapefile_ID week using "${processed_dir}/weather/weekly_weather_ca_prisons_tract.dta", keep(master match) nogenerate
rename (week_m_*) (*)
merge m:1 year week using "${processed_dir}/GasPrices_WeeklyCaliforniaAllGrades.dta", keep(master match) nogenerate 
merge m:1 year month using "${processed_dir}/phones/CA_residents_count_2017_2020.dta", keep(master match) nogenerate 
merge m:1 year week using "${processed_dir}/macroecon/sp500_weekly.dta", keep(master match) nogenerate
merge m:1 year month using "${processed_dir}/macroecon/ca_unemployment_rate", keep(master match) nogenerate

gen norm_visits = uniq_phones / obs * 1000000
gen start_date = end_date - 7
format start_date %td

local vars "new_year mlk_day presidents_day good_friday memorial_day junteenth fourth_of_july labor_day veterans_day thanksgiving christmas"
foreach var of local vars{
	gen `var' = 0
}

replace new_year        = 1 if (mdy(1,2,2017)  >= start_date & mdy(1,2,2017)  < end_date) // 1.2 is public holiday for 2017
replace mlk_day         = 1 if (mdy(1,16,2017) >= start_date & mdy(1,16,2017) < end_date)
replace presidents_day  = 1 if (mdy(2,20,2017) >= start_date & mdy(2,20,2017) < end_date)
replace good_friday     = 1 if (mdy(4,15,2017) >= start_date & mdy(4,15,2017) < end_date)
replace memorial_day    = 1 if (mdy(5,29,2017) >= start_date & mdy(5,29,2017) < end_date)
replace junteenth       = 1 if (mdy(6,19,2017) >= start_date & mdy(6,19,2017) < end_date)
replace fourth_of_july  = 1 if (mdy(7,4,2017)  >= start_date & mdy(7,4,2017)  < end_date)
replace labor_day       = 1 if (mdy(9,4,2017)  >= start_date & mdy(9,4,2017)  < end_date)
replace veterans_day    = 1 if (mdy(11,10,2017)>= start_date & mdy(11,10,2017)< end_date)
replace thanksgiving    = 1 if (mdy(11,23,2017)>= start_date & mdy(11,23,2017)< end_date)
replace christmas       = 1 if (mdy(12,25,2017)>= start_date & mdy(12,25,2017)< end_date)

replace new_year        = 1 if (mdy(1,1,2018)  >= start_date & mdy(1,1,2018)  < end_date) 
replace mlk_day         = 1 if (mdy(1,15,2018) >= start_date & mdy(1,15,2018) < end_date)
replace presidents_day  = 1 if (mdy(2,19,2018) >= start_date & mdy(2,19,2018) < end_date)
replace good_friday     = 1 if (mdy(3,30,2018) >= start_date & mdy(3,30,2018) < end_date)
replace memorial_day    = 1 if (mdy(5,28,2018) >= start_date & mdy(5,28,2018) < end_date)
replace junteenth       = 1 if (mdy(6,19,2018) >= start_date & mdy(6,19,2018) < end_date)
replace fourth_of_july  = 1 if (mdy(7,4,2018)  >= start_date & mdy(7,4,2018)  < end_date)
replace labor_day       = 1 if (mdy(9,3,2018)  >= start_date & mdy(9,3,2018)  < end_date)
//replace election_day    = 1 if (mdy(11,6,2018)  >= start_date & mdy(11,6,2018)  < end_date)
replace veterans_day    = 1 if (mdy(11,12,2018)>= start_date & mdy(11,12,2018)< end_date)
replace thanksgiving    = 1 if (mdy(11,22,2018)>= start_date & mdy(11,22,2018)< end_date)
replace christmas       = 1 if (mdy(12,25,2018)>= start_date & mdy(12,25,2018)< end_date)

replace new_year        = 1 if (mdy(1,1,2019)  >= start_date & mdy(1,1,2019)  < end_date) 
replace mlk_day         = 1 if (mdy(1,21,2019) >= start_date & mdy(1,21,2019) < end_date)
replace presidents_day  = 1 if (mdy(2,18,2019) >= start_date & mdy(2,18,2019) < end_date)
replace good_friday     = 1 if (mdy(4,19,2019) >= start_date & mdy(4,19,2019) < end_date)
replace memorial_day    = 1 if (mdy(5,27,2019) >= start_date & mdy(5,27,2019) < end_date)
replace junteenth       = 1 if (mdy(6,19,2019) >= start_date & mdy(6,19,2019) < end_date)
replace fourth_of_july  = 1 if (mdy(7,4,2019)  >= start_date & mdy(7,4,2019)  < end_date)
replace labor_day       = 1 if (mdy(9,2,2019)  >= start_date & mdy(9,2,2019)  < end_date)
replace veterans_day    = 1 if (mdy(11,11,2019)>= start_date & mdy(11,11,2019)< end_date)
replace thanksgiving    = 1 if (mdy(11,28,2019)>= start_date & mdy(11,28,2019)< end_date)
replace christmas       = 1 if (mdy(12,25,2019)>= start_date & mdy(12,25,2019)< end_date)

replace new_year        = 1 if (mdy(1,1,2020)  >= start_date & mdy(1,1,2020)  < end_date) 
replace mlk_day         = 1 if (mdy(1,20,2020) >= start_date & mdy(1,20,2020) < end_date)
replace presidents_day  = 1 if (mdy(2,17,2020) >= start_date & mdy(2,17,2020) < end_date)
replace good_friday     = 1 if (mdy(4,10,2020) >= start_date & mdy(4,10,2020) < end_date)
replace memorial_day    = 1 if (mdy(5,25,2020) >= start_date & mdy(5,25,2020) < end_date)
replace junteenth       = 1 if (mdy(6,19,2020) >= start_date & mdy(6,19,2020) < end_date)
replace fourth_of_july  = 1 if (mdy(7,3,2020)  >= start_date & mdy(7,3,2020)  < end_date)
replace labor_day       = 1 if (mdy(9,7,2020)  >= start_date & mdy(9,7,2020)  < end_date)
replace veterans_day    = 1 if (mdy(11,11,2020)>= start_date & mdy(11,11,2020)< end_date)
replace thanksgiving    = 1 if (mdy(11,26,2020)>= start_date & mdy(11,26,2020)< end_date)
replace christmas       = 1 if (mdy(12,25,2020)>= start_date & mdy(12,25,2020)< end_date)
	
egen byte public_holiday = rowmax(new_year mlk_day presidents_day good_friday memorial_day junteenth fourth_of_july labor_day veterans_day thanksgiving christmas)

sort NAME year week 
gen lag_gas_price = gas_price[_n-1] if week == week[_n-1] + 1 & year == year[_n-1] & NAME == NAME[_n-1]

	
local vars "uniq_phones avg_income_1k norm_visits gas_price lag_gas_price"
foreach var of local vars{
	gen log_`var' = log(`var')
}

label variable avg_income_1k "Median Household Income (1K)"
label variable avg_pct_hisp "\% Hispanics"
label variable avg_pct_white "\% White"
label variable avg_pct_black"\% Black"
label variable avg_pct_asian "\% Asian"
label variable avg_pct_college "\% College"

label variable avg_income_1k_vh "Median Household Income (1K) (Phones during 7am-3pm)"
label variable avg_pct_hisp_vh "\% Hispanics (Phones during 7am-3pm)"
label variable avg_pct_white_vh "\% White (Phones during 7am-3pm)"
label variable avg_pct_black_vh "\% Black (Phones during 7am-3pm)"
label variable avg_pct_asian_vh "\% Asian (Phones during 7am-3pm)"
label variable avg_pct_college_vh "\% College (Phones during 7am-3pm)"

label variable public_holiday "Public Holiday Dummy"
label variable tmax "Max. Temperature"
label variable tmin "Min. Temperature"
label variable ppt "Precipitation"
label variable log_sp500 "Log(SP500)"
label variable ca_unemployment_rate "CA Unemployment Rate"

compress
save "${processed_dir}/weekly_state_prisons_analysis_data_2017_2020.dta", replace
